This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Load the Google Play Store export into a data frame
data <- data.frame(read.csv("googleplaystore.csv"))
library(ggplot2)
# Duplicate listing: count the App entries that share identical values in
# every other column, keep only the groups that occur more than once, and
# put the most repeated groups first.
duplicate_apps <- subset(
  aggregate(App ~ ., data = data, FUN = length),
  App > 1
)
duplicate_apps <- duplicate_apps[order(-duplicate_apps$App), ]
# Uncomment to inspect the full listing:
# View(duplicate_apps)
# print(duplicate_apps)
print(paste("Number of duplicated Apps:", nrow(duplicate_apps)))
## [1] "Number of duplicated Apps: 404"
# Drop rows with no app name, then keep only the first row for each app name
data_clean <- subset(data, !is.na(App))
data_clean <- data_clean[!duplicated(data_clean$App), ]
# Number of distinct app names left after de-duplication
unique_apps <- length(unique(data_clean$App))
print(paste(
  "Number of unique apps after removing the duplicates:",
  unique_apps
))
## [1] "Number of unique apps after removing the duplicates: 9660"
About 404 apps appear two or three times in the data. After removing all duplicated app names, 9660 unique apps remain in the data frame (1181 rows removed).
Below is the dataframe with number of unique values and NA value for each variables in the dataset after removing the duplicates.
# Per-column profile of data_clean after de-duplication: the distinct values
# (collapsed into one comma-separated string), how many distinct values there
# are, and how many entries are NA.
unique_values_list <- lapply(data_clean, unique)
# Reuse the distinct values computed above instead of calling unique() again
unique_counts_list <- lapply(unique_values_list, length)
null_counts_list <- lapply(data_clean, function(col) sum(is.na(col)))
unique_df <- data.frame(
  # vapply() guarantees one string per column; sapply() would silently change
  # its return type on empty or irregular input.
  Unique_Values = vapply(
    unique_values_list,
    function(x) paste(x, collapse = ", "),
    character(1)
  ),
  Unique_Counts = unlist(unique_counts_list),
  Null_Counts = unlist(null_counts_list)
)
# Storage type of Price before any conversion
typeof(data_clean$Price)
## [1] "character"
Price therefore needs to be converted to a numeric type.
# Optional check for dollar symbols before the conversion:
# data_clean$Price[]
# Strip every literal "$" and parse what is left as a number
data_clean$Price <- as.numeric(
  gsub("$", "", data_clean$Price, fixed = TRUE)
)
# Optional re-check for dollar symbols after the conversion:
# data_clean$Price[]
# Five-number summary (plus mean and NA count) of the converted prices
summary(data_clean$Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 0.000 0.000 1.099 0.000 400.000 1
From the unique_df, there is a missing value present in the Price column.
# Flag prices that are NA and prices that compare equal to the empty string
missing_na <- is.na(data_clean$Price)
missing_blank <- data_clean$Price == ""
sum(missing_na)
## [1] 1
sum(missing_blank, na.rm = TRUE)
## [1] 0
# Keep only the rows whose Price is neither NA nor blank
data_clean <- data_clean[
  !(is.na(data_clean$Price) | data_clean$Price == ""),
]
One row (#10473), an app without a category name, has been removed; it is not required for the analysis.
#Recheck for missing values
summary(data_clean$Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 1.099 0.000 400.000
The missing values in Price have been removed successfully.
# Distribution of prices as a histogram (2-dollar bins)
library(ggplot2)
ggplot(data_clean, aes(x = Price)) +
  geom_histogram(binwidth = 2, fill = "pink", color = "black") +
  # Zoom with coord_cartesian() instead of xlim()/ylim(): scale limits drop
  # every bar that does not fit inside them, which removed the zero-price bar
  # (far more than 500 apps) from the chart. coord_cartesian() keeps all the
  # data and only restricts the visible window.
  coord_cartesian(xlim = c(0, 500), ylim = c(0, 500)) +
  labs(title = "Price Distribution", x = "Price", y = "Frequency") +
  theme_minimal()
The data is highly skewed as there are many zero price entries.
# Same price data as a box plot; outliers are drawn as small red dots
ggplot(data = data_clean, mapping = aes(y = Price)) +
  geom_boxplot(
    fill = "pink",
    color = "black",
    outlier.colour = "red",
    outlier.shape = 16,
    outlier.size = 1
  ) +
  labs(title = "Price Boxplot", y = "Price") +
  theme_minimal()
#' Report boxplot outliers of one column and optionally blank them out
#'
#' Outliers are the values returned by `boxplot.stats(...)$out`. The requested
#' charts are drawn twice: first for the raw values, then for the values with
#' the outliers set to `NA`. A short text summary is printed whenever at least
#' one chart is requested.
#'
#' @param df A data frame.
#' @param var Unquoted name of the column of `df` to check.
#' @param rm If `TRUE`, return `df` with the outliers of `var` set to `NA`;
#'   if `FALSE`, return `df` unchanged.
#' @param boxplt Draw box plots?
#' @param histogram Draw histograms?
#' @param qqplt Draw normal Q-Q plots?
#' @return Invisibly, the (possibly modified) data frame.
outlierKD2 <- function(df, var, rm = FALSE, boxplt = FALSE, histogram = TRUE, qqplt = FALSE) {
  dt <- df # Working copy; only altered when rm = TRUE
  var_label <- as.character(substitute(var))
  var_name <- eval(substitute(var), dt)
  na1 <- sum(is.na(var_name))
  m1 <- mean(var_name, na.rm = TRUE)
  colTotal <- boxplt + histogram + qqplt # Number of chart types requested

  if (colTotal > 0) {
    # Two rows (with / without outliers) and one column per chart type.
    # The previous graphics settings are restored on exit so later plots are
    # not forced into this grid.
    old_par <- par(mfrow = c(2, max(2, colTotal)), oma = c(0, 0, 3, 0))
    on.exit(par(old_par), add = TRUE)
  }

  # First row: raw data, outliers still present
  if (qqplt) {
    qqnorm(var_name, main = "Q-Q plot with Outliers")
    qqline(var_name)
  }
  if (histogram) {
    hist(var_name, main = "Histogram with Outliers", xlab = NA, ylab = NA)
  }
  if (boxplt) {
    boxplot(var_name, main = "Box Plot with Outliers")
  }

  # Identify the outliers and blank them out in the working vector
  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  var_name[var_name %in% outlier] <- NA

  # Second row: same charts with the outliers removed
  if (qqplt) {
    qqnorm(var_name, main = "Q-Q plot without Outliers")
    qqline(var_name)
  }
  if (histogram) {
    hist(var_name, main = "Histogram without Outliers", xlab = NA, ylab = NA)
  }
  if (boxplt) {
    boxplot(var_name, main = "Boxplot without Outliers")
  }

  # Overall title and text summary, only when at least one chart was drawn
  if (colTotal > 0) {
    title("Outlier Check", outer = TRUE)
    na2 <- sum(is.na(var_name))
    cat("Outliers identified:", na2 - na1, "\n")
    # The percentage is relative to the values that remain after removal
    cat("Proportion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name)) * 100, 1), "\n")
    cat("Mean of the outliers:", round(mo, 2), "\n")
    cat("Mean without removing outliers:", round(m1, 2), "\n")
    cat("Mean if we remove outliers:", round(mean(var_name, na.rm = TRUE), 2), "\n")
  }

  if (!rm) {
    cat("Nothing changed", "\n")
    return(invisible(df))
  }
  # rm = TRUE: write the cleaned vector back into the copy
  dt[var_label] <- var_name
  cat("Outliers successfully removed", "\n")
  invisible(dt)
}
# Outlier report for Price with all three chart types; rm = FALSE, so the
# returned data frame is the input unchanged
outlier_check_price <- outlierKD2(data_clean, Price, rm = FALSE, boxplt = TRUE, qqplt = TRUE)
## Outliers identified: 756
## Proportion (%) of outliers: 8.5
## Mean of the outliers: 14.05
## Mean without removing outliers: 1.1
## Mean if we remove outliers: 0
## Nothing changed
The price values here are valid observations for our analysis(both typical and extreme values), so removing these outliers might not be useful.
#To check the value ranges
table(data_clean$Price)
##
## 0 0.99 1 1.04 1.2 1.26 1.29 1.49 1.5 1.59 1.61
## 8903 145 3 1 1 1 1 46 1 1 1
## 1.7 1.75 1.76 1.96 1.97 1.99 2 2.49 2.5 2.56 2.59
## 2 1 1 1 1 73 3 25 1 1 1
## 2.6 2.9 2.95 2.99 3.02 3.04 3.08 3.28 3.49 3.61 3.88
## 1 1 1 124 1 1 1 1 7 1 1
## 3.9 3.95 3.99 4.29 4.49 4.59 4.6 4.77 4.8 4.84 4.85
## 1 1 57 1 9 1 1 1 1 1 1
## 4.99 5 5.49 5.99 6.49 6.99 7.49 7.99 8.49 8.99 9
## 70 1 5 26 5 11 2 7 2 5 1
## 9.99 10 10.99 11.99 12.99 13.99 14 14.99 15.46 15.99 16.99
## 19 2 2 3 4 2 1 9 1 1 2
## 17.99 18.99 19.4 19.9 19.99 24.99 25.99 28.99 29.99 30.99 33.99
## 2 1 1 1 5 3 1 1 5 1 1
## 37.99 39.99 46.99 74.99 79.99 89.99 109.99 154.99 200 299.99 379.99
## 1 2 1 1 1 1 1 1 1 1 1
## 389.99 394.99 399.99 400
## 1 1 12 1
table(data_clean$Type)
##
## Free Paid
## 8902 756
#Missing values
print(paste("Missing values:",sum(is.na(data_clean$Type))))
## [1] "Missing values: 0"
data_clean[is.na(data_clean$Type), ]
## [1] App Category Rating Reviews Size
## [6] Installs Type Price Content.Rating Genres
## [11] Last.Updated Current.Ver Android.Ver
## <0 rows> (or 0-length row.names)
Row 9150 has a missing value for Type. As its price is 0, the value is replaced with “Free”.
# Replace missing values in the Type column with "Free".
# The gap is stored as the literal string "NaN" rather than a real NA:
# is.na() reports 0 missing values, yet "NaN" appears as its own level in the
# per-type summary. An is.na()-only replacement therefore never fires, so
# both forms are matched here.
type_missing <- is.na(data_clean$Type) | data_clean$Type %in% "NaN"
data_clean$Type[type_missing] <- "Free"
# Bar chart of how many apps are Free and how many are Paid
ggplot(data_clean, aes(x = Type)) +
  geom_bar(fill = "pink", color = "black") +
  labs(title = "Distribution of App Types (Free vs Paid)", x = "Type", y = "Count") +
  theme_minimal()
# Price statistics per app type; the row names come from the tapply() results
data_clean$Type <- as.factor(data_clean$Type)
summary_by_type <- data.frame(
  Type = levels(data_clean$Type),
  Min_Price = tapply(data_clean$Price, data_clean$Type, min, na.rm = TRUE),
  Max_Price = tapply(data_clean$Price, data_clean$Type, max, na.rm = TRUE),
  Mean_Price = tapply(data_clean$Price, data_clean$Type, mean, na.rm = TRUE),
  Median_Price = tapply(data_clean$Price, data_clean$Type, median, na.rm = TRUE)
)
print(summary_by_type)
## Type Min_Price Max_Price Mean_Price Median_Price
## Free Free 0.00 0 0.00000 0.00
## NaN NaN 0.00 0 0.00000 0.00
## Paid Paid 0.99 400 14.04515 2.99
# Box plot of price for each app type
ggplot(data = data_clean, mapping = aes(x = Type, y = Price, fill = Type)) +
  geom_boxplot() +
  labs(
    title = "Price Distribution by App Type",
    x = "App Type",
    y = "Price ($)"
  ) +
  theme_minimal()
# Price histogram (60-dollar bins), one panel per app type
ggplot(data = data_clean, mapping = aes(x = Price, fill = Type)) +
  geom_histogram(position = "identity", binwidth = 60, alpha = 0.7) +
  facet_wrap(~ Type) +
  labs(
    title = "Price Distribution by App Type",
    x = "Price ($)",
    y = "Count"
  ) +
  theme_minimal()
Analysing the price distribution by app type shows that some values in the Type column do not correctly reflect the price of the apps. Since we can rely completely on the prices, the Type column is not required for our analysis.
Removing Type column…
# Keep every column except Type, then show the resulting structure
data_clean <- data_clean[names(data_clean) != "Type"]
str(data_clean)
## 'data.frame': 9659 obs. of 12 variables:
## $ App : chr "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite – FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
## $ Category : chr "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
## $ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
## $ Reviews : chr "159" "967" "87510" "215644" ...
## $ Size : chr "19M" "14M" "8.7M" "25M" ...
## $ Installs : chr "10,000+" "500,000+" "5,000,000+" "50,000,000+" ...
## $ Price : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Content.Rating: chr "Everyone" "Everyone" "Everyone" "Teen" ...
## $ Genres : chr "Art & Design" "Art & Design;Pretend Play" "Art & Design" "Art & Design" ...
## $ Last.Updated : chr "January 7, 2018" "January 15, 2018" "August 1, 2018" "June 8, 2018" ...
## $ Current.Ver : chr "1.0.0" "2.0.0" "1.2.4" "Varies with device" ...
## $ Android.Ver : chr "4.0.3 and up" "4.0.3 and up" "4.0.3 and up" "4.2 and up" ...
head(data_clean)
## App Category Rating
## 1 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1
## 2 Coloring book moana ART_AND_DESIGN 3.9
## 3 U Launcher Lite – FREE Live Cool Themes, Hide Apps ART_AND_DESIGN 4.7
## 4 Sketch - Draw & Paint ART_AND_DESIGN 4.5
## 5 Pixel Draw - Number Art Coloring Book ART_AND_DESIGN 4.3
## 6 Paper flowers instructions ART_AND_DESIGN 4.4
## Reviews Size Installs Price Content.Rating Genres
## 1 159 19M 10,000+ 0 Everyone Art & Design
## 2 967 14M 500,000+ 0 Everyone Art & Design;Pretend Play
## 3 87510 8.7M 5,000,000+ 0 Everyone Art & Design
## 4 215644 25M 50,000,000+ 0 Teen Art & Design
## 5 967 2.8M 100,000+ 0 Everyone Art & Design;Creativity
## 6 167 5.6M 50,000+ 0 Everyone Art & Design
## Last.Updated Current.Ver Android.Ver
## 1 January 7, 2018 1.0.0 4.0.3 and up
## 2 January 15, 2018 2.0.0 4.0.3 and up
## 3 August 1, 2018 1.2.4 4.0.3 and up
## 4 June 8, 2018 Varies with device 4.2 and up
## 5 June 20, 2018 1.1 4.4 and up
## 6 March 26, 2017 1.0 2.3 and up
The Type column is successfully removed.
head(data_clean)
## App Category Rating
## 1 Photo Editor & Candy Camera & Grid & ScrapBook ART_AND_DESIGN 4.1
## 2 Coloring book moana ART_AND_DESIGN 3.9
## 3 U Launcher Lite – FREE Live Cool Themes, Hide Apps ART_AND_DESIGN 4.7
## 4 Sketch - Draw & Paint ART_AND_DESIGN 4.5
## 5 Pixel Draw - Number Art Coloring Book ART_AND_DESIGN 4.3
## 6 Paper flowers instructions ART_AND_DESIGN 4.4
## Reviews Size Installs Price Content.Rating Genres
## 1 159 19M 10,000+ 0 Everyone Art & Design
## 2 967 14M 500,000+ 0 Everyone Art & Design;Pretend Play
## 3 87510 8.7M 5,000,000+ 0 Everyone Art & Design
## 4 215644 25M 50,000,000+ 0 Teen Art & Design
## 5 967 2.8M 100,000+ 0 Everyone Art & Design;Creativity
## 6 167 5.6M 50,000+ 0 Everyone Art & Design
## Last.Updated Current.Ver Android.Ver
## 1 January 7, 2018 1.0.0 4.0.3 and up
## 2 January 15, 2018 2.0.0 4.0.3 and up
## 3 August 1, 2018 1.2.4 4.0.3 and up
## 4 June 8, 2018 Varies with device 4.2 and up
## 5 June 20, 2018 1.1 4.4 and up
## 6 March 26, 2017 1.0 2.3 and up
tail(data_clean)
## App Category Rating
## 10836 FR Forms BUSINESS NaN
## 10837 Sya9a Maroc - FR FAMILY 4.5
## 10838 Fr. Mike Schmitz Audio Teachings FAMILY 5.0
## 10839 Parkinson Exercices FR MEDICAL NaN
## 10840 The SCP Foundation DB fr nn5n BOOKS_AND_REFERENCE 4.5
## 10841 iHoroscope - 2018 Daily Horoscope & Astrology LIFESTYLE 4.5
## Reviews Size Installs Price Content.Rating
## 10836 0 9.6M 10+ 0 Everyone
## 10837 38 53M 5,000+ 0 Everyone
## 10838 4 3.6M 100+ 0 Everyone
## 10839 3 9.5M 1,000+ 0 Everyone
## 10840 114 Varies with device 1,000+ 0 Mature 17+
## 10841 398307 19M 10,000,000+ 0 Everyone
## Genres Last.Updated Current.Ver
## 10836 Business September 29, 2016 1.1.5
## 10837 Education July 25, 2017 1.48
## 10838 Education July 6, 2018 1.0
## 10839 Medical January 20, 2017 1.0
## 10840 Books & Reference January 19, 2015 Varies with device
## 10841 Lifestyle July 25, 2018 Varies with device
## Android.Ver
## 10836 4.0 and up
## 10837 4.1 and up
## 10838 4.1 and up
## 10839 2.2 and up
## 10840 Varies with device
## 10841 Varies with device
str(data_clean)
## 'data.frame': 9659 obs. of 12 variables:
## $ App : chr "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite – FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
## $ Category : chr "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
## $ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
## $ Reviews : chr "159" "967" "87510" "215644" ...
## $ Size : chr "19M" "14M" "8.7M" "25M" ...
## $ Installs : chr "10,000+" "500,000+" "5,000,000+" "50,000,000+" ...
## $ Price : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Content.Rating: chr "Everyone" "Everyone" "Everyone" "Teen" ...
## $ Genres : chr "Art & Design" "Art & Design;Pretend Play" "Art & Design" "Art & Design" ...
## $ Last.Updated : chr "January 7, 2018" "January 15, 2018" "August 1, 2018" "June 8, 2018" ...
## $ Current.Ver : chr "1.0.0" "2.0.0" "1.2.4" "Varies with device" ...
## $ Android.Ver : chr "4.0.3 and up" "4.0.3 and up" "4.0.3 and up" "4.2 and up" ...
The cleaning and analysis of the Price and App columns is now done. Next, let's proceed with Ratings and Reviews.
## chr [1:9659] "159" "967" "87510" "215644" "967" "167" "178" "36815" ...
## num [1:9659] 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
As we can see, the Reviews column is stored as strings; converting it to a numeric type allows further analysis.
##Change the column reviews from Str to int
## 'data.frame': 9659 obs. of 12 variables:
## $ App : chr "Photo Editor & Candy Camera & Grid & ScrapBook" "Coloring book moana" "U Launcher Lite – FREE Live Cool Themes, Hide Apps" "Sketch - Draw & Paint" ...
## $ Category : chr "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" "ART_AND_DESIGN" ...
## $ Rating : num 4.1 3.9 4.7 4.5 4.3 4.4 3.8 4.1 4.4 4.7 ...
## $ Reviews : num 159 967 87510 215644 967 ...
## $ Size : chr "19M" "14M" "8.7M" "25M" ...
## $ Installs : chr "10,000+" "500,000+" "5,000,000+" "50,000,000+" ...
## $ Price : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Content.Rating: chr "Everyone" "Everyone" "Everyone" "Teen" ...
## $ Genres : chr "Art & Design" "Art & Design;Pretend Play" "Art & Design" "Art & Design" ...
## $ Last.Updated : chr "January 7, 2018" "January 15, 2018" "August 1, 2018" "June 8, 2018" ...
## $ Current.Ver : chr "1.0.0" "2.0.0" "1.2.4" "Varies with device" ...
## $ Android.Ver : chr "4.0.3 and up" "4.0.3 and up" "4.0.3 and up" "4.2 and up" ...
| App | Category | Rating | Reviews | Size | Installs | Price | Content.Rating | Genres | Last.Updated | Current.Ver | Android.Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Length:9659 | Length:9659 | Min. :1.000 | Min. : 0 | Length:9659 | Length:9659 | Min. : 0.000 | Length:9659 | Length:9659 | Length:9659 | Length:9659 | Length:9659 |
| Q1 | Class :character | Class :character | 1st Qu.:4.000 | 1st Qu.: 25 | Class :character | Class :character | 1st Qu.: 0.000 | Class :character | Class :character | Class :character | Class :character | Class :character |
| Median | Mode :character | Mode :character | Median :4.300 | Median : 967 | Mode :character | Mode :character | Median : 0.000 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character |
| Mean | NA | NA | Mean :4.173 | Mean : 216593 | NA | NA | Mean : 1.099 | NA | NA | NA | NA | NA |
| Q3 | NA | NA | 3rd Qu.:4.500 | 3rd Qu.: 29401 | NA | NA | 3rd Qu.: 0.000 | NA | NA | NA | NA | NA |
| Max | NA | NA | Max. :5.000 | Max. :78158306 | NA | NA | Max. :400.000 | NA | NA | NA | NA | NA |
| NA | NA | NA | NA’s :1463 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
There are 1463 missing values in rating.
# Distribution of NA ratings by category
# Rows whose Rating is missing
df_na_rating <- filter(data_clean, is.na(Rating))
# Number of missing ratings per category, largest first
na_rating_distribution <- df_na_rating %>%
  group_by(Category) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
# Horizontal bar chart with each count printed in the middle of its bar
ggplot(
  data = na_rating_distribution,
  mapping = aes(x = reorder(Category, -count), y = count)
) +
  geom_col(fill = "steelblue") +
  geom_text(
    mapping = aes(label = count),
    position = position_stack(vjust = 0.5),
    color = "white",
    size = 3
  ) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Distribution of NA Ratings by Category",
    x = "Category",
    y = "Count of NA Ratings"
  ) +
  theme(axis.text.y = element_text(size = 8))
As can be observed, apps in the Family category have the highest number of NA
values.
# Impute rather than drop: missing ratings are filled with an average so the
# rows stay available for the rest of the analysis
library(dplyr)
# Method 1: every NA Rating becomes the overall mean of the known ratings
data_clean1 <- data_clean %>%
  mutate(
    Rating = replace(Rating, is.na(Rating), mean(Rating, na.rm = TRUE))
  )
xkablesummary(data_clean1)
| App | Category | Rating | Reviews | Size | Installs | Price | Content.Rating | Genres | Last.Updated | Current.Ver | Android.Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Length:9659 | Length:9659 | Min. :1.000 | Min. : 0 | Length:9659 | Length:9659 | Min. : 0.000 | Length:9659 | Length:9659 | Length:9659 | Length:9659 | Length:9659 |
| Q1 | Class :character | Class :character | 1st Qu.:4.000 | 1st Qu.: 25 | Class :character | Class :character | 1st Qu.: 0.000 | Class :character | Class :character | Class :character | Class :character | Class :character |
| Median | Mode :character | Mode :character | Median :4.200 | Median : 967 | Mode :character | Mode :character | Median : 0.000 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character |
| Mean | NA | NA | Mean :4.173 | Mean : 216593 | NA | NA | Mean : 1.099 | NA | NA | NA | NA | NA |
| Q3 | NA | NA | 3rd Qu.:4.500 | 3rd Qu.: 29401 | NA | NA | 3rd Qu.: 0.000 | NA | NA | NA | NA | NA |
| Max | NA | NA | Max. :5.000 | Max. :78158306 | NA | NA | Max. :400.000 | NA | NA | NA | NA | NA |
Now there are no missing values
## Outlier check for Rating: how often each rating value occurs
# NOTE(review): `breaks` is not used by the statements in this chunk
breaks <- seq(15, 20, by = 1)
frequency_table <- table(data_clean1$Rating)
frequency_table
##
## 1 1.2 1.4 1.5
## 16 1 3 3
## 1.6 1.7 1.8 1.9
## 4 8 8 11
## 2 2.1 2.2 2.3
## 12 8 14 20
## 2.4 2.5 2.6 2.7
## 19 20 24 23
## 2.8 2.9 3 3.1
## 40 45 81 69
## 3.2 3.3 3.4 3.5
## 63 100 126 156
## 3.6 3.7 3.8 3.9
## 167 224 286 359
## 4 4.1 4.17324304538799 4.2
## 513 621 1463 810
## 4.3 4.4 4.5 4.6
## 897 895 848 683
## 4.7 4.8 4.9 5
## 442 221 85 271
From above it can be seen all the rating are between 1 and 5
## Visualising the Rating distribution (ratings after mean imputation)
boxplot(data_clean1$Rating, ylab = "Rating", xlab = "Count", col = "Blue")
hist(data_clean1$Rating, main = "Histogram of Apps Rating after cleaning", xlab = "Rating (count)", col = "blue", breaks = 100)
qqnorm(data_clean1$Rating)
# The reference line must come from the same vector as the Q-Q points;
# it was previously drawn from data_clean$Rating (the un-imputed column).
qqline(data_clean1$Rating, col = "red")
Here, the plots are much clearer but still skewed because of the remaining
outliers in the 1-3 rating range. These low ratings may help explain why
some apps are poorly rated, so they cannot be removed
from our dataset.
## Plotting for Reviews
boxplot(data_clean1$Reviews, ylab = "Reviews", xlab = "Count", col = "Blue")
hist(data_clean1$Reviews, main = "Histogram of Apps Reviews", xlab = "Reviews (count)", col = "blue", breaks = 100)
# Log scale to spread out the heavily right-skewed review counts.
# The title and x label previously said "Ratings"/"Rating" although the
# plotted variable is Reviews.
ggplot(data_clean1, aes(x = log(Reviews))) +
  geom_histogram(binwidth = 0.1, fill = "blue", color = "black") +
  labs(title = "Log-Transformed Histogram of Reviews", x = "Log(Reviews)", y = "Count")
qqnorm(data_clean1$Reviews)
qqline(data_clean1$Reviews, col = "red")
As with the ratings, the plots are skewed because of the outliers. Hence, we can use the log plot of reviews for the visualisation, which is a normalised version of Reviews. Because the reviews are skewed, they do not follow a normal distribution.
##Review frequency table
xkablesummary(data_clean1)
| App | Category | Rating | Reviews | Size | Installs | Price | Content.Rating | Genres | Last.Updated | Current.Ver | Android.Ver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Min | Length:9659 | Length:9659 | Min. :1.000 | Min. : 0 | Length:9659 | Length:9659 | Min. : 0.000 | Length:9659 | Length:9659 | Length:9659 | Length:9659 | Length:9659 |
| Q1 | Class :character | Class :character | 1st Qu.:4.000 | 1st Qu.: 25 | Class :character | Class :character | 1st Qu.: 0.000 | Class :character | Class :character | Class :character | Class :character | Class :character |
| Median | Mode :character | Mode :character | Median :4.200 | Median : 967 | Mode :character | Mode :character | Median : 0.000 | Mode :character | Mode :character | Mode :character | Mode :character | Mode :character |
| Mean | NA | NA | Mean :4.173 | Mean : 216593 | NA | NA | Mean : 1.099 | NA | NA | NA | NA | NA |
| Q3 | NA | NA | 3rd Qu.:4.500 | 3rd Qu.: 29401 | NA | NA | 3rd Qu.: 0.000 | NA | NA | NA | NA | NA |
| Max | NA | NA | Max. :5.000 | Max. :78158306 | NA | NA | Max. :400.000 | NA | NA | NA | NA | NA |
outlierKD2(data_clean1,Reviews)
## Outliers identified: 1656
## Proportion (%) of outliers: 20.7
## Mean of the outliers: 1228141
## Mean without removing outliers: 216592.6
## Mean if we remove outliers: 7280.61
## Nothing changed
To check which values are outliers, let's split the data into sections, that is,
create bins and check which bins hold the most data. This helps us see how
reviews are distributed.
##Binned reviews
Binning the reviews so that the average rating of each bin can be checked
# Bin edges for the review counts; each bin is closed on the left and open
# on the right, and the last bin is unbounded above
breaks <- c(0, 100, 500, 1000, 2500, 5000, 10000, 25000, 50000, 100000, 300000, 1000000, Inf)
# Label every app with the bin its review count falls into
Review_Category <- cut(
  data_clean1$Reviews,
  breaks = breaks,
  labels = c(
    "0+", "100+", "500+", "1K+",
    "2.5K+", "5K+", "10K+", "25K+",
    "50K+", "100K+", "300K+", "1M+"
  ),
  right = FALSE
)
# Apps per bin, as a two-column data frame with readable column names
bin_counts <- setNames(
  as.data.frame(table(Review_Category)),
  c("Review_Category", "Count")
)
# Show the counts
print(bin_counts)
## Review_Category Count
## 1 0+ 3327
## 2 100+ 1065
## 3 500+ 462
## 4 1K+ 586
## 5 2.5K+ 475
## 6 5K+ 474
## 7 10K+ 719
## 8 25K+ 606
## 9 50K+ 498
## 10 100K+ 647
## 11 300K+ 451
## 12 1M+ 349
# Line chart (with point markers) of the number of apps in each review bin
ggplot(
  data = bin_counts,
  mapping = aes(x = Review_Category, y = Count, group = 1)
) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "blue", size = 3) +
  labs(
    title = "Count of Reviews by Review Category",
    x = "Review Category",
    y = "Count of Reviews"
  ) +
  theme_minimal() +
  # Slanted x-axis labels so the bin names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Hence, only a few apps have a high number of reviews, while most apps have
few reviews, which is expected.
# Box plot of app ratings within each review bin.
# The title previously read "Review Counts" although the plotted variable is
# Rating (see the y-axis label).
boxplot(Rating ~ Review_Category, data = data_clean1,
        main = "Boxplot of Ratings by Review Category",
        xlab = "Review Category",
        ylab = "Review Rating",
        las = 2, # Rotate the x-axis labels for readability
        col = "lightblue")
Here we can observe that, as the number of reviews increases, the median rating
increases and the values cluster around higher ratings, which suggests
that a high number of reviews could mean a better-rated app.
library(dplyr)
# Average Rating inside each review bin
mean_ratings <- tapply(
  data_clean1$Rating,
  Review_Category,
  mean,
  na.rm = TRUE
)
# Same numbers as a two-column data frame: bin label and mean rating
mean_ratings_df <- data.frame(
  Review_Category = names(mean_ratings),
  Mean_Rating = as.numeric(mean_ratings)
)
# Show the mean rating of every bin
print(mean_ratings_df)
## Review_Category Mean_Rating
## 1 0+ 4.126221
## 2 100+ 4.029538
## 3 500+ 4.063188
## 4 1K+ 4.107030
## 5 2.5K+ 4.129572
## 6 5K+ 4.191139
## 7 10K+ 4.221836
## 8 25K+ 4.231848
## 9 50K+ 4.293775
## 10 100K+ 4.329830
## 11 300K+ 4.375610
## 12 1M+ 4.426361
# Turn the bin labels into a factor whose levels run from the smallest bin to
# the largest, so the bars are not sorted alphabetically
mean_ratings_df$Review_Category <- factor(
  mean_ratings_df$Review_Category,
  levels = c(
    "0+", "100+", "500+", "1K+",
    "2.5K+", "5K+", "10K+", "25K+",
    "50K+", "100K+", "300K+", "1M+"
  )
)
# Bar chart of the mean rating per review bin, in that order
ggplot(
  data = mean_ratings_df,
  mapping = aes(x = Review_Category, y = Mean_Rating)
) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Mean Rating by Review Category",
    x = "Review Category",
    y = "Mean Rating"
  ) +
  theme_minimal() +
  # Slanted x-axis labels so the bin names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
As we can see, the mean rating increases as the reviews increase.
## Histogram of Reviews and Rating
# Pair every rating with the review bin of the same app
plot_data <- data.frame(
  Rating = data_clean1$Rating,
  Review_Category = Review_Category
)
# One rating histogram (30 bins) per review bin
ggplot(data = plot_data, mapping = aes(x = Rating)) +
  geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
  facet_wrap(~ Review_Category, labeller = label_wrap_gen()) +
  theme_minimal() +
  labs(
    title = "Histograms of Ratings by Review Category",
    x = "Rating",
    y = "Frequency"
  )
This is another representation of ratings vs reviews
The tests below are to test whether or not different categories have different average ratings.
anova_result <- aov(Rating ~ as.factor(Review_Category), data = data_clean1)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(Review_Category) 11 106.3 9.662 41.36 <2e-16 ***
## Residuals 9647 2253.6 0.234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value is significant, so we can conclude that the average rating is not the same across all review categories.
# Perform Tukey's HSD
tukey_result <- TukeyHSD(anova_result)
tukey_result
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Rating ~ as.factor(Review_Category), data = data_clean1)
##
## $`as.factor(Review_Category)`
## diff lwr upr p adj
## 100+-0+ -0.096683215 -0.152307271 -0.04105916 0.0000009
## 500+-0+ -0.063032835 -0.141474646 0.01540898 0.2646281
## 1K+-0+ -0.019190832 -0.089971134 0.05158947 0.9992526
## 2.5K+-0+ 0.003350463 -0.074143085 0.08084401 1.0000000
## 5K+-0+ 0.064918154 -0.012646893 0.14248320 0.2087515
## 10K+-0+ 0.095614797 0.030638525 0.16059107 0.0000973
## 25K+-0+ 0.105627098 0.035846939 0.17540726 0.0000488
## 50K+-0+ 0.167554014 0.091642554 0.24346547 0.0000000
## 100K+-0+ 0.203608898 0.135724795 0.27149300 0.0000000
## 300K+-0+ 0.249388670 0.170111342 0.32866600 0.0000000
## 1M+-0+ 0.300139945 0.211244127 0.38903576 0.0000000
## 500+-100+ 0.033650380 -0.054364565 0.12166533 0.9848292
## 1K+-100+ 0.077492383 -0.003768703 0.15875347 0.0784345
## 2.5K+-100+ 0.100033678 0.012862795 0.18720456 0.0096675
## 5K+-100+ 0.161601369 0.074366918 0.24883582 0.0000001
## 10K+-100+ 0.192298012 0.116039053 0.26855697 0.0000000
## 25K+-100+ 0.202310313 0.121918874 0.28270175 0.0000000
## 50K+-100+ 0.264237229 0.178469737 0.35000472 0.0000000
## 100K+-100+ 0.300292113 0.221540831 0.37904339 0.0000000
## 300K+-100+ 0.346071885 0.257311491 0.43483228 0.0000000
## 1M+-100+ 0.396823160 0.299375844 0.49427048 0.0000000
## 1K+-500+ 0.043842003 -0.054455739 0.14213974 0.9515761
## 2.5K+-500+ 0.066383298 -0.036853541 0.16962014 0.6214468
## 5K+-500+ 0.127950989 0.024660470 0.23124151 0.0030189
## 10K+-500+ 0.158647632 0.064443010 0.25285225 0.0000025
## 25K+-500+ 0.168659933 0.071079887 0.26623998 0.0000011
## 50K+-500+ 0.230586849 0.128532233 0.33264146 0.0000000
## 100K+-500+ 0.266641733 0.170408442 0.36287502 0.0000000
## 300K+-500+ 0.312421505 0.207839051 0.41700396 0.0000000
## 1M+-500+ 0.363172780 0.251123410 0.47522215 0.0000000
## 2.5K+-1K+ 0.022541295 -0.075001405 0.12008400 0.9998394
## 5K+-1K+ 0.084108986 -0.013490527 0.18170850 0.1727899
## 10K+-1K+ 0.114805629 0.026878134 0.20273312 0.0012014
## 25K+-1K+ 0.124817930 0.033283243 0.21635262 0.0005180
## 50K+-1K+ 0.186744846 0.090454254 0.28303544 0.0000000
## 100K+-1K+ 0.222799730 0.132702117 0.31289734 0.0000000
## 300K+-1K+ 0.268579502 0.169613735 0.36754527 0.0000000
## 1M+-1K+ 0.319330777 0.212504774 0.42615678 0.0000000
## 5K+-2.5K+ 0.061567691 -0.041004546 0.16413993 0.7193424
## 10K+-2.5K+ 0.092264334 -0.001152170 0.18568084 0.0565429
## 25K+-2.5K+ 0.102276635 0.005457227 0.19909604 0.0276896
## 50K+-2.5K+ 0.164203551 0.062875978 0.26553112 0.0000078
## 100K+-2.5K+ 0.200258435 0.104796512 0.29572036 0.0000000
## 300K+-2.5K+ 0.246038206 0.142165102 0.34991131 0.0000000
## 1M+-2.5K+ 0.296789482 0.185401898 0.40817707 0.0000000
## 10K+-5K+ 0.030696643 -0.062779181 0.12417247 0.9957463
## 25K+-5K+ 0.040708944 -0.056167701 0.13758559 0.9685508
## 50K+-5K+ 0.102635860 0.001253596 0.20401812 0.0440982
## 100K+-5K+ 0.138690744 0.043170771 0.23421072 0.0001331
## 300K+-5K+ 0.184470516 0.080544059 0.28839697 0.0000004
## 1M+-5K+ 0.235221791 0.123784453 0.34665913 0.0000000
## 25K+-10K+ 0.010012302 -0.077112114 0.09713672 0.9999999
## 50K+-10K+ 0.071939217 -0.020169104 0.16404754 0.3070668
## 100K+-10K+ 0.107994101 0.022380758 0.19360745 0.0022235
## 300K+-10K+ 0.153773873 0.058872409 0.24867534 0.0000078
## 1M+-10K+ 0.204525148 0.101453039 0.30759726 0.0000000
## 50K+-25K+ 0.061926916 -0.033630908 0.15748474 0.6094814
## 100K+-25K+ 0.097981800 0.008667751 0.18729585 0.0175649
## 300K+-25K+ 0.143761571 0.045508620 0.24201452 0.0001113
## 1M+-25K+ 0.194512847 0.088346871 0.30067882 0.0000001
## 100K+-50K+ 0.036054884 -0.058127272 0.13023704 0.9846717
## 300K+-50K+ 0.081834656 -0.020863551 0.18453286 0.2768896
## 1M+-50K+ 0.132585931 0.022293168 0.24287869 0.0048805
## 300K+-100K+ 0.045779772 -0.051135776 0.14269532 0.9282456
## 1M+-100K+ 0.096531047 -0.008398431 0.20146052 0.1064662
## 1M+-300K+ 0.050751275 -0.061884591 0.16338714 0.9479902
# Pairwise comparison table from the Tukey result, as a data frame
tukey_df <- as.data.frame(tukey_result[["as.factor(Review_Category)"]])
# Keep the pairs whose adjusted p-value (fourth column, "p adj") is below 0.05
significant_tukey <- tukey_df[tukey_df[["p adj"]] < 0.05, ]
# Show the significant pairs
print(significant_tukey)
## diff lwr upr p adj
## 100+-0+ -0.09668322 -0.152307271 -0.04105916 8.987756e-07
## 10K+-0+ 0.09561480 0.030638525 0.16059107 9.732720e-05
## 25K+-0+ 0.10562710 0.035846939 0.17540726 4.884843e-05
## 50K+-0+ 0.16755401 0.091642554 0.24346547 0.000000e+00
## 100K+-0+ 0.20360890 0.135724795 0.27149300 0.000000e+00
## 300K+-0+ 0.24938867 0.170111342 0.32866600 0.000000e+00
## 1M+-0+ 0.30013994 0.211244127 0.38903576 0.000000e+00
## 2.5K+-100+ 0.10003368 0.012862795 0.18720456 9.667490e-03
## 5K+-100+ 0.16160137 0.074366918 0.24883582 9.538328e-08
## 10K+-100+ 0.19229801 0.116039053 0.26855697 0.000000e+00
## 25K+-100+ 0.20231031 0.121918874 0.28270175 0.000000e+00
## 50K+-100+ 0.26423723 0.178469737 0.35000472 0.000000e+00
## 100K+-100+ 0.30029211 0.221540831 0.37904339 0.000000e+00
## 300K+-100+ 0.34607188 0.257311491 0.43483228 0.000000e+00
## 1M+-100+ 0.39682316 0.299375844 0.49427048 0.000000e+00
## 5K+-500+ 0.12795099 0.024660470 0.23124151 3.018884e-03
## 10K+-500+ 0.15864763 0.064443010 0.25285225 2.473396e-06
## 25K+-500+ 0.16865993 0.071079887 0.26623998 1.080775e-06
## 50K+-500+ 0.23058685 0.128532233 0.33264146 0.000000e+00
## 100K+-500+ 0.26664173 0.170408442 0.36287502 0.000000e+00
## 300K+-500+ 0.31242150 0.207839051 0.41700396 0.000000e+00
## 1M+-500+ 0.36317278 0.251123410 0.47522215 0.000000e+00
## 10K+-1K+ 0.11480563 0.026878134 0.20273312 1.201416e-03
## 25K+-1K+ 0.12481793 0.033283243 0.21635262 5.179950e-04
## 50K+-1K+ 0.18674485 0.090454254 0.28303544 1.572425e-08
## 100K+-1K+ 0.22279973 0.132702117 0.31289734 0.000000e+00
## 300K+-1K+ 0.26857950 0.169613735 0.36754527 0.000000e+00
## 1M+-1K+ 0.31933078 0.212504774 0.42615678 0.000000e+00
## 25K+-2.5K+ 0.10227664 0.005457227 0.19909604 2.768961e-02
## 50K+-2.5K+ 0.16420355 0.062875978 0.26553112 7.808701e-06
## 100K+-2.5K+ 0.20025843 0.104796512 0.29572036 3.507883e-10
## 300K+-2.5K+ 0.24603821 0.142165102 0.34991131 0.000000e+00
## 1M+-2.5K+ 0.29678948 0.185401898 0.40817707 0.000000e+00
## 50K+-5K+ 0.10263586 0.001253596 0.20401812 4.409823e-02
## 100K+-5K+ 0.13869074 0.043170771 0.23421072 1.331239e-04
## 300K+-5K+ 0.18447052 0.080544059 0.28839697 4.428778e-07
## 1M+-5K+ 0.23522179 0.123784453 0.34665913 2.244942e-10
## 100K+-10K+ 0.10799410 0.022380758 0.19360745 2.223466e-03
## 300K+-10K+ 0.15377387 0.058872409 0.24867534 7.832139e-06
## 1M+-10K+ 0.20452515 0.101453039 0.30759726 5.942656e-09
## 100K+-25K+ 0.09798180 0.008667751 0.18729585 1.756493e-02
## 300K+-25K+ 0.14376157 0.045508620 0.24201452 1.113055e-04
## 1M+-25K+ 0.19451285 0.088346871 0.30067882 1.436204e-07
## 1M+-50K+ 0.13258593 0.022293168 0.24287869 4.880458e-03
As we can see, there are significant differences in average rating between review categories, most notably between 0+ and 1M+, as expected.
## For an easier comparison of Ratings and Reviews vs Installs, we can group Installs into the given categories
# Load necessary libraries
library(ggplot2)
# Collect the distinct labels found in the 'Installs' column.
unique_values <- unique(data_clean1[["Installs"]])
# Convert install-count labels such as "1,000,000+" to their numeric value.
#
# x: character vector of install labels.
# Returns a numeric vector of the same length as x, e.g. "10,000+" -> 10000.
#
# Every non-digit character (thousands separators, the trailing "+") is
# stripped before conversion. No per-comma scaling is applied: removing the
# separators already leaves the complete number, so "1,000,000+" is 1e6.
# The conversion is element-wise, so it is safe on whole vectors.
convert_to_numeric <- function(x) {
  as.numeric(gsub("[^0-9]", "", x))
}
# Rank the distinct install labels by their numeric value, then put the labels
# in that order so they can be used as factor levels.
install_rank <- order(sapply(unique_values, convert_to_numeric))
sorted_values <- unique_values[install_rank]
# Bar chart of the number of rows per install label. The ordered factor is
# built inside aes(), so no column is added to data_clean1.
ggplot(
  data = data_clean1,
  mapping = aes(x = factor(Installs, levels = sorted_values))
) +
  geom_bar(fill = "blue", alpha = 0.7) +
  labs(x = "Installs", y = "Count", title = "Distribution of App Installs") +
  # Rotate x-axis labels for readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Ratings vs Installs
Now we can check the average rating for each Install category and the relationship between the two.
# Load necessary libraries
library(ggplot2)
library(dplyr)
# Convert install-count labels such as "1,000,000+" to their numeric value.
#
# x: character vector of install labels.
# Returns a numeric vector of the same length as x, e.g. "10,000+" -> 10000.
#
# Every non-digit character (thousands separators, the trailing "+") is
# stripped before conversion. No per-comma scaling is applied: removing the
# separators already leaves the complete number, so "1,000,000+" is 1e6.
# The conversion is element-wise, so it is safe on whole vectors.
convert_to_numeric <- function(x) {
  as.numeric(gsub("[^0-9]", "", x))
}
# For every install label: the mean rating (missing ratings ignored) and the
# number of rows carrying that label.
data_mean <- ungroup(
  summarise(
    group_by(data_clean1, Installs),
    Mean_Rating = mean(Rating, na.rm = TRUE),
    Count = n()
  )
)
# Order the install labels by numeric value for use as factor levels.
install_labels <- data_mean[["Installs"]]
sorted_installs <- install_labels[
  order(sapply(install_labels, convert_to_numeric))
]
# Lollipop-style plot: one point per install label at its mean rating, sized by
# Count, with a dashed grey stem running from 0 up to the point.
ggplot(
  data_mean,
  aes(x = factor(Installs, levels = sorted_installs), y = Mean_Rating)
) +
  geom_point(aes(size = Count), color = "blue", alpha = 0.7) +
  geom_segment(
    aes(
      x = factor(Installs, levels = sorted_installs),
      xend = factor(Installs, levels = sorted_installs),
      y = 0,
      yend = Mean_Rating
    ),
    color = "grey",
    linetype = "dashed"
  ) +
  # Legend title for the point-size scale
  scale_size_continuous(name = "Number of Ratings") +
  labs(
    title = "Mean Ratings by Install Category",
    x = "Install Categories",
    y = "Mean Ratings"
  ) +
  theme_minimal() +
  # Rotate x-axis labels for readability (must come after theme_minimal())
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The analysis reveals that both low and high install counts correspond to high ratings. However, apps with a greater number of installs and high ratings are generally regarded as superior, as indicated by the size of the dots in the plot, which reflects the volume of ratings they have received.
# Number of distinct values in the Category column
length(unique(data_clean[["Category"]]))
## [1] 33
# Number of distinct values in the Genres column
length(unique(data_clean[["Genres"]]))
## [1] 118
There are 33 categories in the dataframe with 118 genres. This means that each category contains multiple genres. Given that, the later analyses in this project proceed with the Category variable.
Below is the graph for the distribution of Categories for the dataset after removing duplicates.
# Distribution of Category: count the rows per category.
category_counts <- table(data_clean[["Category"]])
# Turn the table into a two-column data frame (label, count) for plotting.
category_counts_df <- setNames(
  as.data.frame(category_counts),
  c("Category", "Frequency")
)
# Horizontal bar chart, categories ordered by frequency, with the count
# printed at the end of each bar.
ggplot(
  data = category_counts_df,
  mapping = aes(x = reorder(Category, Frequency), y = Frequency)
) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_text(aes(label = Frequency), vjust = 0.5, hjust = 1, size = 2.5) +
  coord_flip() +
  labs(
    title = "Distribution of Categories",
    x = "Category",
    y = "Frequency"
  ) +
  theme_minimal() +
  # Small tick labels so all categories fit on the axis
  theme(axis.text.y = element_text(size = 5.5))
## Category vs. Installs
library(DT)
# Turn install labels such as "10,000+" into numbers.
#
# Installs: vector of install labels.
# Returns a numeric vector: the "+" sign is removed first, then the commas,
# and the remaining text is passed to as.numeric().
clean_installs <- function(Installs) {
  as.numeric(gsub(",", "", gsub("\\+", "", Installs)))
}
# Convert the install labels to numbers. clean_installs() works on a whole
# vector, so it is applied to the column directly; going through sapply() on a
# character column would also attach the original labels as element names.
data_clean$Installs <- clean_installs(data_clean$Installs)
# Flag every ROW that has a missing value in Size or Installs. The check is on
# NA rather than NaN because a failed numeric conversion yields NA, which
# is.nan() does not detect; complete.cases() treats both NA and NaN as missing.
nan_rows <- !complete.cases(data_clean[, c("Size", "Installs")])
# Display only rows that contain a missing value in either Size or Installs
data_clean[nan_rows, ]
## data frame with 0 columns and 9659 rows
# Interactive table of the cleaned data with horizontal scrolling enabled
datatable(data_clean, options = list(scrollX = TRUE))
# Collect the distinct labels found in the 'Installs' column and print them.
unique_values <- unique(data_clean1[["Installs"]])
print(unique_values)
## [1] "10,000+" "500,000+" "5,000,000+" "50,000,000+"
## [5] "100,000+" "50,000+" "1,000,000+" "10,000,000+"
## [9] "5,000+" "100,000,000+" "1,000,000,000+" "1,000+"
## [13] "500,000,000+" "50+" "100+" "500+"
## [17] "10+" "1+" "5+" "0+"
## [21] "0"
# Convert install-count labels such as "1,000,000+" to their numeric value.
#
# x: character vector of install labels.
# Returns a numeric vector of the same length as x, e.g. "10,000+" -> 10000.
#
# Every non-digit character (thousands separators, the trailing "+") is
# stripped before conversion. No per-comma scaling is applied: removing the
# separators already leaves the complete number, so "1,000,000+" is 1e6.
# The conversion is element-wise, so it is safe on whole vectors.
convert_to_numeric <- function(x) {
  as.numeric(gsub("[^0-9]", "", x))
}
# Rank the distinct install labels by their numeric value and arrange the
# labels in that order.
install_rank <- order(sapply(unique_values, convert_to_numeric))
sorted_values <- unique_values[install_rank]
# Work on a copy so data_clean1 itself keeps its original Installs column;
# in the copy, Installs becomes a factor with the sorted labels as levels.
data_clean1_factor <- data_clean1
data_clean1_factor[["Installs"]] <- factor(
  data_clean1[["Installs"]],
  levels = sorted_values
)
# Bar chart of the number of rows per install label, in level order.
ggplot(data = data_clean1_factor, mapping = aes(x = Installs)) +
  geom_bar() +
  labs(x = "Installs", y = "Count", title = "Distribution of App Installs") +
  # Rotate x-axis labels for readability
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Scatter plot with Reviews on the x-axis and the Installs factor (from
# data_clean1_factor) on the y-axis.
ggplot(
  data = data_clean1_factor,
  mapping = aes(x = Reviews, y = Installs)
) +
  geom_point(alpha = 0.5, color = "blue") +
  labs(
    x = "Number of Reviews",
    y = "Number of Installs",
    title = "Scatter Plot of Installs vs Reviews"
  ) +
  theme_minimal()
# Log-transform the Installs. log(0) is -Inf, so rows with zero installs would
# otherwise put infinite values into the stored column; any non-finite result
# is recorded as NA instead, leaving the values for positive install counts
# unchanged.
log_installs <- log(data_clean$Installs)
log_installs[!is.finite(log_installs)] <- NA_real_
data_clean$log_Installs <- log_installs
# Scatter plot of log-transformed Installs vs. Rating
ggplot(data_clean, aes(x = log_Installs, y = Rating)) +
  geom_point(color = "blue", alpha = 0.6) +
  # Add a linear regression line without a confidence band
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  labs(
    title = "Log-Transformed Installs vs. Rating",
    x = "Log(Installs)",
    y = "Rating"
  ) +
  theme_minimal()